library(STutility)
## Loading required package: Seurat
## Attaching SeuratObject
## Loading required package: ggplot2
## Registered S3 method overwritten by 'imager':
## method from
## plot.imlist
library(gprofiler2)
library(msigdbr)
library(GSA)
library(corrplot)
## corrplot 0.86 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Create infotable
infoTable <- data.frame(samples = c("/Users/ivarkaram/Downloads/Archive/V19T26-103_A1/filtered_feature_bc_matrix.h5", "/Users/ivarkaram/Downloads/Archive/V19T26-103_B1/filtered_feature_bc_matrix.h5", "/Users/ivarkaram/Downloads/Archive/V19T26-103_C1/filtered_feature_bc_matrix.h5", "/Users/ivarkaram/Downloads/Archive/V19T26-103_D1/filtered_feature_bc_matrix.h5"),
spotfiles = c("/Users/ivarkaram/Downloads/Archive/V19T26-103_A1/spatial/tissue_positions_list.csv", "/Users/ivarkaram/Downloads/Archive/V19T26-103_B1/spatial/tissue_positions_list.csv", "/Users/ivarkaram/Downloads/Archive/V19T26-103_C1/spatial/tissue_positions_list.csv", "/Users/ivarkaram/Downloads/Archive/V19T26-103_D1/spatial/tissue_positions_list.csv"),
imgs = c("/Users/ivarkaram/Downloads/Archive/V19T26-103_A1/spatial/tissue_hires_image.png", "/Users/ivarkaram/Downloads/Archive/V19T26-103_B1/spatial/tissue_hires_image.png",
"/Users/ivarkaram/Downloads/Archive/V19T26-103_C1/spatial/tissue_hires_image.png",
"/Users/ivarkaram/Downloads/Archive/V19T26-103_D1/spatial/tissue_hires_image.png"),
json = c("/Users/ivarkaram/Downloads/Archive/V19T26-103_A1/spatial/scalefactors_json.json",
"/Users/ivarkaram/Downloads/Archive/V19T26-103_B1/spatial/scalefactors_json.json", "/Users/ivarkaram/Downloads/Archive/V19T26-103_C1/spatial/scalefactors_json.json", "/Users/ivarkaram/Downloads/Archive/V19T26-103_D1/spatial/scalefactors_json.json"),
section.name = c("A1", "B1", "C1", "D1"),
stringsAsFactors = FALSE)
Create seurat object
se <- InputFromTable(infotable = infoTable,
platform = "Visium")
## Using spotfiles to remove spots outside of tissue
## Loading /Users/ivarkaram/Downloads/Archive/V19T26-103_A1/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
## Loading /Users/ivarkaram/Downloads/Archive/V19T26-103_B1/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
## Loading /Users/ivarkaram/Downloads/Archive/V19T26-103_C1/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
## Loading /Users/ivarkaram/Downloads/Archive/V19T26-103_D1/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
##
## ------------- Filtering (not including images based filtering) --------------
## Spots removed: 0
## Genes removed: 7495
## Saving capture area ranges to Staffli object
## After filtering the dimensions of the experiment is: [26043 genes, 16194 spots]
se
## An object of class Seurat
## 26043 features across 16194 samples within 1 assay
## Active assay: RNA (26043 features, 0 variable features)
Start with plotting the raw counts.
Clear linear relationship between the number of unique genes as a function of the number of RNA molecules. This is also motivation to perform normalization
FeatureScatter(se, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "section.name")
By using the ST.FeaturePlot function we can start exploring how much RNA was captured in the tissue and the unique number of RNA in the tissue for all of the spots across every slide. Let’s visualize that.
ST.FeaturePlot(se, features = c('nCount_RNA', 'nFeature_RNA'), dark.theme = F, cols = c("lightgray", "mistyrose", "red", "dark red", "black"), ncol = 2)
Filtering could have been applied when creating the seurat object above by passing in all or any of the following parameters:
–>min.gene.count, which removes all genes in the tissue below a certain threshold. –>min.gene.spots, which sets a threshold of a minimum number of spots where genes are detected in. –>min.spot.feature.count, which sets a threshold of the least number of unique genes in a spot. –>min.spot.count, which sets a threshold of the least number of counts in a spot. –>topN, which only retains the n most expressed genes in the tissue.
However, it would be very naive to filter the data without exploring it more in depth to understand the actual distribution of some of these parameters in the data.
It would be interesting to explore the number of unique genes/spot and total number of counts/spot in our tissue.
p1 <- ggplot(data = se[[]], aes(nFeature_RNA)) +
geom_histogram(aes(fill=..count..), color='black', bins = 50, alpha=0.9) +
ggtitle("Unique genes per spot")
p2 <- ggplot(data = se[[]], aes(nCount_RNA)) +
geom_histogram(aes(fill=..count..), color='black', bins = 50, alpha=0.9) +
ggtitle("Total counts per spots")
p1
p2
We can also visualize the distribution with violin-plots.
VlnPlot(se, features = c("nFeature_RNA", "nCount_RNA"), group.by = "section.name")
We can see in the first histogram representing the unique genes/spot that we seem to have a number of spots with very few unique genes. We can plot a vertical line to understand how many unique genes there are in these spots.
p1 + geom_vline(xintercept = 180, linetype='dashed')
Want to look for variation in these and outliers and ultimately decide whether they should be kept or not as they would affect downstream analysis somewhat
mt.genes <- grep(pattern = "^MT-", x = rownames(se), value = TRUE)
se$percent.mito <- (Matrix::colSums(se@assays$RNA@counts[mt.genes, ])/Matrix::colSums(se@assays$RNA@counts))*100
# Collect all genes coding for ribosomal proteins
rp.genes <- grep(pattern = "^RPL|^RPS", x = rownames(se), value = TRUE)
se$percent.ribo <- (Matrix::colSums(se@assays$RNA@counts[rp.genes, ])/Matrix::colSums(se@assays$RNA@counts))*100
FeatureScatter(se, feature1 = "nFeature_RNA", feature2 = "percent.mito", group.by = "section.name")
FeatureScatter(se, feature1 = "nFeature_RNA", feature2 = "percent.ribo", group.by = "section.name")
VlnPlot(object = se, features = "percent.mito", group.by = "section.name") + NoLegend()
VlnPlot(object = se, features = "percent.ribo", group.by = "section.name") + NoLegend()
Based on quality check it could be a good idea to remove spots with less than 180 unique genes. In addition, to remove the effect of downstream analysis both mitochondrial and ribosomal genes are removed from the meta data.
se.subset <- SubsetSTData(se, expression = nFeature_RNA > 180)
genes <- rownames(se)
keep.genes <- genes[!(grepl("^(MT-|RPL|RPS)",genes))]
se.subset <- SubsetSTData(object = se.subset, features = keep.genes)
cat("Spots removed: ", ncol(se) - ncol(se.subset))
## Spots removed: 72
cat("\n")
cat("Genes removed: ", nrow(se) - nrow(se.subset))
## Genes removed: 110
se.subset <- LoadImages(se.subset, time.resolve = F, verbose = T, xdim = 2000)
## Loading images for 4 samples:
## Reading /Users/ivarkaram/Downloads/Archive/V19T26-103_A1/spatial/tissue_hires_image.png for sample 1 ...
## Scaling down sample 1 image from 2000x1916 pixels to 2000x1916 pixels
## Reading /Users/ivarkaram/Downloads/Archive/V19T26-103_B1/spatial/tissue_hires_image.png for sample 1 ...
## Scaling down sample 2 image from 2000x1937 pixels to 2000x1937 pixels
## Reading /Users/ivarkaram/Downloads/Archive/V19T26-103_C1/spatial/tissue_hires_image.png for sample 1 ...
## Scaling down sample 3 image from 2000x1937 pixels to 2000x1937 pixels
## Reading /Users/ivarkaram/Downloads/Archive/V19T26-103_D1/spatial/tissue_hires_image.png for sample 1 ...
## Scaling down sample 4 image from 2000x1937 pixels to 2000x1937 pixels
se.subset <- MaskImages(object = se.subset)
ImagePlot(se.subset, ncols = 2, method = "raster", type = "raw", darken = F)
Why? To remove technical effects (linear relationship), for example compare the distribution of a gene across a tissue (scaled vs raw counts) would be wrong as it will not take into account the sequencing depth
se.subset <- SCTransform(se.subset, verbose = FALSE)
Explore the the distribution of the 2 main principal components using violin plots for each of the sections to investigate whether we would need to regress out the sections to make them more comparable.
se.subset <- RunPCA(se.subset, verbose = FALSE)
VlnPlot(se.subset, features = "PC_1", group.by = "section.name") +NoLegend()
VlnPlot(se.subset, features = "PC_2", group.by = "section.name")+NoLegend()
The choice of numbers of factors are quite arbitary. However, it is good to consider the library complexity and manually check whether the driver genes makes sense.
se.subset_5 <- RunNMF(se.subset, nfactors = 10)
Check correlation of factors to see if there are any relationship between them
cor.data_5 <- cor(se.subset_5[["NMF"]]@feature.loadings, method = "pearson")
cscale <- colorRampPalette(c("white", "lightblue", "darkred"))
corrplot(cor.data_5,method="color", cl.lim = c(0,1), col = cscale(200))
Check driver genes of each factor
len_factor <- length(se.subset_5[["NMF"]])
for (i in (1:len_factor)){
plt<-FactorGeneLoadingPlot(se.subset_5, factor = i, topn = 10, dark.theme = F) + theme(axis.text.y = element_text(face="bold", color="#993333", size=14),
axis.text.x = element_text(face="bold", color="#993333", size=14), axis.title = element_blank())
print(plt)
}
Plot the spatial distribution of the 10 factors
n_nmf <- length(se.subset_5[['NMF']])
for (i in 1:n_nmf){
print(ST.DimPlot(se.subset_5,
dims = i,
ncol = 2, # Sets the number of columns at dimensions level
grid.ncol = 1, # Sets the number of columns at sample level
reduction = "NMF",
dark.theme = F,
pt.size = 1,
center.zero = F,
palette="Spectral",
show.sb = FALSE) + ggtitle(" "))
}
Make a function, “genes_factor”, which takes a seurat object and number as input, and returns the gene names of the top n number of driver genes from each of the factors.
genes_factor <- function(seurat.object, n){
factor <- c()
for (i in 1:length(seurat.object[['NMF']])){
factor_n <- names(sort((seurat.object[["NMF"]]@feature.loadings[,i]), decreasing = TRUE)[1:n])
gene_list <- list(factor_n)
factor <- c(factor, gene_list)
}
return(factor)
}
Select 20 top driver genes from each factor
factor <- genes_factor(se.subset_5, 20)
factor[[1]]
## [1] "MDK" "BCAM" "WFDC2" "GAPDH" "NDUFA13" "CD9"
## [7] "ITM2C" "KRT7" "KRT8" "LAPTM4B" "SPINT2" "CLDN4"
## [13] "SLPI" "MUC1" "MSLN" "GPI" "FAM171A2" "IFI6"
## [19] "IFI27" "EPCAM"
##Pathway analysis We continue with pathway analysis which is a great tool for understanding and annotating the functions of our genes in a broader context.
Here gprofiler2 will be used to perform over representation analysis with GO:BP, hallmarks and
One issue with ORA is the fact that all genes are ranked the same when performing pathway enrichment. To circumvent this issue somewhat the argument ordered_query is set to TRUE. Thus each gene will be added iteratively, before pathway enrichment will be performed for that list of gene. The most significantly enriched will be stored.
len_factor <- length(se.subset_5[["NMF"]])
for (i in 1:len_factor){
enrichment <- gost(query = factor[[i]], ordered_query = TRUE, organism = 'hsapiens', correction_method = 'fdr', user_threshold = 0.01, multi_query = TRUE)
plt <- gostplot(enrichment, capped = FALSE, interactive = TRUE)
print(plt)
}
Problem, for almost all pathways we get a massive output that is hard to summarize. We need to simplify this a bit.
Redo pathway analysis but only enrich for GO:BP and keep the top 3 enriched terms.
df <- data.frame()
for (i in 1:len_factor){
enrichment <- gost(query = factor[[i]], ordered_query = TRUE, organism = 'hsapiens', correction_method = 'fdr', user_threshold = 0.01, source="GO:BP")
enrichment$result <- mutate(enrichment$result, factor = i) # add the factor as column value
df <- rbind(df, enrichment$result[,c("term_name", "p_value", "factor")][1:3,])
}
log10_pvalue <- -log10(unlist(df$p_value))
df <- cbind(df, log10_pvalue)
df$term_name <- toupper(df$term_name)
Can also perform enrichment for hallmarks of cancer. Upload a downloaded gmt file to ggprofiler
gprofiler2::upload_GMT_file(gmtfile ="/Users/ivarkaram/Downloads/h.all.v7.2.symbols.gmt" ) # enter filepath
Perform pathway enrichment, print the need to check which are annotated to the hallmarks
for (i in 1:len_factor){
print(i)
enrichment.cancer <- gost(query = factor[[i]], ordered_query = TRUE, correction_method = 'fdr', user_threshold = 0.01, organism = "gp__090C_YOUN_19Y")
}
## [1] 1
## Detected custom GMT source request
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## [1] 2
## Detected custom GMT source request
## [1] 3
## Detected custom GMT source request
## [1] 4
## Detected custom GMT source request
## [1] 5
## Detected custom GMT source request
## [1] 6
## Detected custom GMT source request
## [1] 7
## Detected custom GMT source request
## [1] 8
## Detected custom GMT source request
## [1] 9
## Detected custom GMT source request
## [1] 10
## Detected custom GMT source request
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
Only enrich with annotated factors. Store the result in a common data frame
hallmarks <- c(2:9)
hallmark_df <- data.frame() # make an empty dataframe
for (i in hallmarks){
enrichment.cancer <- gost(query = factor[[i]], ordered_query = TRUE, correction_method = 'fdr', user_threshold = 0.01, organism = "gp__090C_YOUN_19Y")
enrichment.cancer$result <- mutate(enrichment.cancer$result, factor = i)
hallmark_df <- rbind(hallmark_df, enrichment.cancer$result[, c("p_value", "term_id","factor")])
}
## Detected custom GMT source request
## Detected custom GMT source request
## Detected custom GMT source request
## Detected custom GMT source request
## Detected custom GMT source request
## Detected custom GMT source request
## Detected custom GMT source request
## Detected custom GMT source request
log10_pvalue <- -log10(hallmark_df$p_value)
hallmark_df <- cbind(hallmark_df, log10_pvalue)
hallmark_df$term_id <- gsub(pattern = "_", replacement = " ", x = hallmark_df$term_id)
hallmark_df <- hallmark_df %>% rename(term_name = term_id)
Bind the two dataframes before plotting
combined_df <- dplyr::bind_rows(hallmark_df, df)
Visualize
plt <- ggplot(combined_df, aes(x = factor, y = term_name)) +
geom_point(aes(color=term_name,size=log10_pvalue)) +
scale_x_continuous(limits = c(1,10), breaks = c(0:10))+
theme_classic()+
theme(legend.position="right", axis.text = element_text(size=12), axis.title.x = element_text(size=12), axis.title.y = element_blank(), legend.text = element_text(size=12),
legend.title = element_text(size = 12)) +
xlab(label = "Factors")+
guides(color = FALSE, size=guide_legend("-log10 (FDR p-value)"))
plt